# Configuration file for the Sniper simulator

# This file is organized into sections defined in [] brackets as in [section].
# Sections may be hierarchical withsub-sections split by the '/' character as
# in [section/sub_section].
#
# values can be "strings" , numbers, or true/false, existing values
# should indicate the type

# This section controls various high-level simulation parameters.
[general]
magic = false # Enable performance simulation straight away (false), or wait for Roi{Begin,End} magic instruction (true)
roi_script = false # Allow ROI to be set by a script, and ignore Roi{Begin,End} magic instructions
inst_mode_init = cache_only
inst_mode_roi = detailed
inst_mode_end = fast_forward
inst_mode_output = true
syntax = intel # Disassembly syntax (intel, att or xed)
issue_memops_at_functional = false # Issue memory operations to the memory hierarchy as they are executed functionally (Pin front-end only)
num_host_cores = 0 # Number of host cores to use (approximately). 0 = autodetect based on available cores and cpu mask. -1 = no limit (oversubscribe)
enable_signals = false
enable_smc_support = false # Support self-modifying code
enable_pinplay = false # Run with a pinball instead of an application (requires a Pin kit with PinPlay support)
enable_syscall_emulation = true # Emulate system calls, cpuid, rdtsc, etc. (disable when replaying Pinballs)
suppress_stdout = false # Suppress the application's output to stdout
suppress_stderr = false # Suppress the application's output to stderr

# Total number of cores in the simulation
total_cores = 64

enable_icache_modeling = false

# This section is used to fine-tune the logging information. The logging may
# be disabled for performance runs or enabled for debugging.
[log]
enabled = false
stack_trace = false
disabled_modules = ""
enabled_modules = ""
mutex_trace = false
pin_codecache_trace = false
circular_log = false

[progress_trace]
enabled = false
interval = 5000
filename = ""

[clock_skew_minimization]
scheme = barrier
report = false

[clock_skew_minimization/barrier]
quantum = 100                         # Synchronize after every quantum (ns)

# This section describes parameters for the core model
[perf_model/core]
frequency = 1        # In GHz
type = oneipc        # Valid models are oneipc, interval, rob
logical_cpus = 1     # Number of SMT threads per core

[perf_model/core/interval_timer]
#dispatch_width = 4
#window_size = 96
issue_contention = true
num_outstanding_loadstores = 8
memory_dependency_granularity = 8 # In bytes
lll_dependency_granularity = 64 # In bytes. Model the MSHR for overlapping misses by adding additional dependencies on long-latency loads using cache-line granularity
lll_cutoff = 30
issue_memops_at_dispatch = false # Issue memory operations to the cache hierarchy at dispatch (true) or at fetch (false)

# This section describes the number of cycles for
# various arithmetic instructions.
[perf_model/core/static_instruction_costs]
add=1
sub=1
mul=3
div=18
fadd=3
fsub=3
fmul=5
fdiv=6
generic=1
jmp=1
string=1
branch=1
dynamic_misc=1
recv=1
sync=0
spawn=0
tlb_miss=0
mem_access=0
delay=0
unknown=0

[perf_model/branch_predictor]
type=one_bit
mispredict_penalty=14 # A guess based on Penryn pipeline depth
size=1024

[perf_model/tlb]
# Penalty of a page walk (in cycles)
penalty = 0
# Page walk is done by separate hardware in parallel to other core activity (true),
# or by the core itself using a serializing instruction (false, e.g. microcode or OS)
penalty_parallel = true

[perf_model/itlb]
size = 0              # Number of I-TLB entries
associativity = 1     # I-TLB associativity

[perf_model/dtlb]
size = 0              # Number of D-TLB entries
associativity = 1     # D-TLB associativity

[perf_model/stlb]
size = 0              # Number of second-level TLB entries
associativity = 1     # S-TLB associativity

[perf_model/l1_icache]
perfect = false
passthrough = false
coherent = true
cache_block_size = 64
cache_size = 32 # in KB
associativity = 4
address_hash = mask
replacement_policy = lru
data_access_time = 3
tags_access_time = 1
perf_model_type = parallel
writeback_time = 0    # Extra time required to write back data to a higher cache level
dvfs_domain = core    # Clock domain: core or global
shared_cores = 1      # Number of cores sharing this cache
next_level_read_bandwidth = 0 # Read bandwidth to next-level cache, in bits/cycle, 0 = infinite
prefetcher = none

[perf_model/l1_dcache]
perfect = false
passthrough = false
cache_block_size = 64
cache_size = 32 # in KB
associativity = 4
address_hash = mask
replacement_policy = lru
data_access_time = 3
tags_access_time = 1
perf_model_type = parallel
writeback_time = 0    # Extra time required to write back data to a higher cache level
dvfs_domain = core    # Clock domain: core or global
shared_cores = 1      # Number of cores sharing this cache
outstanding_misses = 0
next_level_read_bandwidth = 0 # Read bandwidth to next-level cache, in bits/cycle, 0 = infinite
prefetcher = none

[perf_model/l2_cache]
perfect = false
passthrough = false
cache_block_size = 64 # in bytes
cache_size = 512 # in KB
associativity = 8
address_hash = mask
replacement_policy = lru
data_access_time = 9
tags_access_time = 3  # This is just a guess for Penryn
perf_model_type = parallel
writeback_time = 0    # Extra time required to write back data to a higher cache level
dvfs_domain = core    # Clock domain: core or global
shared_cores = 1      # Number of cores sharing this cache
prefetcher = none     # Prefetcher type
next_level_read_bandwidth = 0 # Read bandwidth to next-level cache, in bits/cycle, 0 = infinite

[perf_model/l3_cache]
perfect = false
passthrough = false

[perf_model/l4_cache]
perfect = false
passthrough = false

[perf_model/llc]
evict_buffers = 8

[perf_model/fast_forward]
model = oneipc        # Performance model during fast-forward (none, oneipc)

[perf_model/fast_forward/oneipc]
interval = 100000     # Barrier quantum in fast-forward, in ns
include_memory_latency = false # Increment time by memory latency
include_branch_misprediction = false # Increment time on branch misprediction

[core]
spin_loop_detection = false

[core/light_cache]
num = 0

[core/cheetah]
enabled = false
min_size_bits = 10
max_size_bits_local = 30
max_size_bits_global = 36

[core/hook_periodic_ins]
ins_per_core = 10000  # After how many instructions should each core increment the global HPI counter
ins_global = 1000000  # Aggregate number of instructions between HOOK_PERIODIC_INS callbacks

[caching_protocol]
type = parametric_dram_directory_msi
variant = mesi                            # msi, mesi or mesif

[perf_model/dram_directory]
total_entries = 16384
associativity = 16
max_hw_sharers = 64                       # number of sharers supported in hardware (ignored if directory_type = full_map)
directory_type = full_map                 # Supported (full_map, limited_no_broadcast, limitless)
home_lookup_param = 6                     # Granularity at which the directory is stripped across different cores
directory_cache_access_time = 10          # Tag directory lookup time (in cycles)
locations = dram                          # dram: at each DRAM controller, llc: at master cache locations, interleaved: every N cores (see below)
interleaving = 1                          # N when locations=interleaved

[perf_model/dram_directory/limitless]
software_trap_penalty = 200               # number of cycles added to clock when trapping into software (pulled number from Chaiken papers, which explores 25-150 cycle penalties)

[perf_model/dram]
type = constant                           # DRAM performance model type: "constant" or a "normal" distribution
latency = 100                             # In nanoseconds
per_controller_bandwidth = 5              # In GB/s
num_controllers = -1                      # Total Bandwidth = per_controller_bandwidth * num_controllers
controllers_interleaving = 0              # If num_controllers == -1, place a DRAM controller every N cores
controller_positions = ""
direct_access = false                     # Access DRAM controller directly from last-level cache (only when there is a single LLC)

[perf_model/dram/normal]
standard_deviation = 0                    # The standard deviation, in nanoseconds, of the normal distribution

[perf_model/dram/cache]
enabled = false

[perf_model/dram/queue_model]
enabled = true
type = history_list

[perf_model/nuca]
enabled = false

[perf_model/sync]
reschedule_cost = 0 # In nanoseconds

# This describes the various models used for the different networks on the core
[network]
# Valid Networks :
# 1) magic
# 2) emesh_hop_counter, emesh_hop_by_hop
# 3) bus
memory_model_1 = emesh_hop_counter
system_model = magic
collect_traffic_matrix = false

[network/emesh_hop_counter]
link_bandwidth = 64 # In bits/cycles
hop_latency = 2

[network/emesh_hop_by_hop]
link_bandwidth = 64   # In bits/cycle
hop_latency = 2       # In cycles
concentration = 1     # Number of cores per network stop
dimensions = 2        # Dimensions (1 for line/ring, 2 for 2-D mesh/torus)
wrap_around = false   # Use wrap-around links (false for line/mesh, true for ring/torus)
size = ""             # ":"-separated list of size for each dimension, default = auto

[network/emesh_hop_by_hop/queue_model]
enabled = true
type = history_list
[network/emesh_hop_by_hop/broadcast_tree]
enabled = false

[network/bus]
ignore_local_traffic = true # Do not count traffic between core and directory on the same tile

[network/bus/queue_model]
type=contention

[queue_model/basic]
moving_avg_enabled = true
moving_avg_window_size = 1024
moving_avg_type = arithmetic_mean

[queue_model/history_list]
# Uses the analytical model (if enabled) to calculate delay if cannot be calculated using the history list
max_list_size = 100
analytical_model_enabled = true

[queue_model/windowed_mg1]
window_size = 1000        # In ns. A few times the barrier quantum should be a good choice

[dvfs]
type = simple
transition_latency = 0 # In nanoseconds

[dvfs/simple]
cores_per_socket = 1

[bbv]
sampling = 0 # Defines N to skip X samples with X uniformely distributed between 0..2*N, so on average 1/N samples

[loop_tracer]
#base_address = 0 # Start address in hex (without 0x)
iter_start = 0
iter_count = 36

[osemu]
pthread_replace = false   # Emulate pthread_{mutex|cond|barrier} functions (false: user-space code is simulated, SYS_futex is emulated)
nprocs = 0                # Overwrite emulated get_nprocs() call (default: return simulated number of cores)
clock_replace = true      # Whether to replace gettimeofday() and friends to return simulated time rather than host wall time
time_start = 1337000000   # Simulator startup time ("time zero") for emulated gettimeofday()

[traceinput]
enabled = false
address_randomization = false # Randomize upper address bits on a per-application basis to avoid cache set contention when running multiple copies of the same trace
stop_with_first_app = true    # Simulation ends when first application ends (else: when last application ends)
restart_apps = false          # When stop_with_first_app=false, whether to restart applications until the longest-running app completes for the first time
mirror_output = false
trace_prefix = ""             # Disable trace file prefixes (for trace and response fifos) by default
num_runs = 1                  # Add 1 for warmup, etc

[scheduler]
type = pinned

[scheduler/pinned]
quantum = 1000000         # Scheduler quantum (round-robin for active threads on each core), in nanoseconds
core_mask = 1             # Mask of cores on which threads can be scheduled (default: 1, all cores)
interleaving = 1          # Interleaving of round-robin initial assignment (e.g. 2 => 0,2,4,6,1,3,5,7)

[scheduler/roaming]
quantum = 1000000         # Scheduler quantum (round-robin for active threads on each core), in nanoseconds
core_mask = 1             # Mask of cores on which threads can be scheduled (default: 1, all cores)

[scheduler/static]
core_mask = 1             # Mask of cores on which threads can be scheduled (default: 1, all cores)

[scheduler/big_small]
quantum = 1000000         # Scheduler quantum, in nanoseconds
debug = false

[hooks]
numscripts = 0

[fault_injection]
type = none
injector = none

[routine_tracer]
type = none

[instruction_tracer]
type = none

[sampling]
enabled = false
